This dataset is from Kaggle

Prepare for analysis

# NOTE(review): setwd() ties this script to one machine's directory layout.
# It is kept because readRDS() below resolves its path relative to it.
setwd("~/../Desktop/Spring2019/MA681_RENEW_PROJECT/SCRIPT/")
suppressMessages(suppressWarnings(library(tidyverse)))
suppressMessages(suppressWarnings(library(corrplot)))
tidied_data <- readRDS("crime_data.rds")

# Human-readable district labels. The i-th label is paired with the i-th
# entry of the sorted district codes computed below, so the order matters.
distric_name <- c("DOWNTOWN AND CHARLESTOWN-1", "DOWNTOWN AND CHARLESTOWN-2",
                  "EAST BOSTON", "ROXBURY", "MATTAPAN", "DORCHESTER",
                  "SOUTH BOSTON", "BRIGHTON", "SOUTH END", "JAMAICA PLAIN",
                  "HYDE PARK", "WEST ROXBURY", "no report")
distric_code <- sort(unique(as.character(tidied_data$DISTRICT)))

# Vectorised lookup on the DISTRICT column by name (not by column position).
# match() yields NA for a code that is absent from distric_code, so the new
# column is always a plain character vector with one entry per row.
tidied_data$district_name <-
  distric_name[match(as.character(tidied_data$DISTRICT), distric_code)]

Total number of crimes

# Line plot of the number of recorded crimes per calendar day.
tidied_data %>%
  group_by(crime_date) %>%
  # The summary column name doubles as the y-axis title, so spell it correctly.
  summarize(Occurrences = n()) %>%
  ggplot(aes(x = as.Date(crime_date), y = Occurrences, group = 1)) +
    geom_line() +
    # Hand-picked tick positions, about 300 days apart.
    scale_x_date(breaks = as.Date(c("2015-06-15", "2016-04-12", "2017-02-09",
                                    "2017-12-06", "2018-10-03"))) +
    xlab("Crime date")

Number of crimes in different districts in different months

# Monthly crime counts per district, drawn as one coloured line per district.
tidied_data %>%
  # Leave out June 2015 and October 2018 (presumably incomplete months at the
  # edges of the data -- TODO confirm).
  filter(!(FIXED_MONTH == "06" & YEAR == 2015),
         !(FIXED_MONTH == "10" & YEAR == 2018)) %>%
  # "YYYY-MM" label used as the discrete x position.
  mutate(year_month = as.factor(paste(YEAR, FIXED_MONTH, sep = "-"))) %>%
  group_by(year_month, district_name) %>%
  summarise(Occurrences = n()) %>%
  ggplot(aes(x = year_month,
             y = Occurrences,
             group = district_name,
             colour = district_name)) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90),
        legend.text = element_text(size = 7)) +
  labs(x = "Year-Month")

Number of crimes in different districts on different days of the week.

# Crime counts per district for every month / day-of-week combination,
# pooled over 2016 and 2017.
tidied_data %>%
  filter(YEAR %in% c(2016, 2017)) %>%
  group_by(FIXED_MONTH, DAY_OF_WEEK, district_name) %>%
  summarize(Occurrences = n()) %>%
  # Drop the leftover grouping so the factor levels below are built globally.
  ungroup() %>%
  mutate(
    month_day = paste(FIXED_MONTH, DAY_OF_WEEK, sep = "-"),
    # Position of each weekday in a Monday..Sunday week. Assumes English full
    # day names -- TODO confirm; unrecognised names get NA and sort last.
    weekday_rank = match(as.character(DAY_OF_WEEK),
                         c("Monday", "Tuesday", "Wednesday", "Thursday",
                           "Friday", "Saturday", "Sunday")),
    # Order the x axis by month, then by weekday. A factor built straight from
    # paste() would sort weekdays alphabetically within each month, and the
    # line would join the points in that order.
    month_day = factor(
      month_day,
      levels = unique(month_day[order(FIXED_MONTH, weekday_rank)])
    )
  ) %>%
  ggplot(aes(x = month_day, y = Occurrences,
             group = district_name, color = district_name)) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90, size = 5.5),
        legend.text = element_text(size = 7)) +
  xlab("Month-Day")

Boxplot of occurrences in each month

# One row per (MONTH, crime_date) with the number of crimes recorded that day,
# restricted to 2016 and 2017.
x <- tidied_data %>%
  filter(YEAR %in% 2016:2017) %>%
  group_by(MONTH, crime_date) %>%
  summarise(Occurrences = n())

# Mean of the daily counts over the whole two-year window.
day_mean <- mean(x[["Occurrences"]])

# Distribution of daily counts per month, one box per month.
ggplot(x, aes(x = MONTH, y = Occurrences)) +
  geom_boxplot(aes(fill = MONTH)) +
  # Dashed horizontal reference line at the overall daily mean.
  geom_hline(yintercept = day_mean, linetype = "dashed",
             color = "black", size = 1.5) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "Month")

Top crime types on Huntington Avenue.

# Bar chart of the ten most frequent offence groups on Huntington Avenue
# (ties at the tenth place are all kept).
tidied_data %>%
  # Restrict to the street first so only its rows are aggregated.
  filter(STREET == "HUNTINGTON AVE") %>%
  group_by(OFFENSE_CODE_GROUP) %>%
  summarize(Occurrences = n()) %>%
  # Rank-based cutoff: keeps every group when fewer than ten exist, whereas
  # indexing the sorted counts at [10] would give NA and drop all rows.
  filter(min_rank(desc(Occurrences)) <= 10) %>%
  ggplot(aes(OFFENSE_CODE_GROUP, Occurrences, fill = OFFENSE_CODE_GROUP)) +
    geom_bar(stat = "identity") +
    # The fill legend already names each bar, so the x tick labels are hidden.
    theme(axis.text.x = element_blank()) +
    xlab("Crime types")

Comparison between crime types in streets.

# Counts of four selected offence groups on five selected streets, shown as
# stacked horizontal bars (one bar per street, one colour per offence group).
tidied_data %>%
  # as.character() keeps the group labels as text even when the column is
  # stored as a factor.
  mutate(Crime_types = as.character(OFFENSE_CODE_GROUP)) %>%
  # Keep only the streets and offence groups of interest; every other
  # offence group is excluded from the chart.
  filter(
    STREET %in% c("HUNTINGTON AVE", "BOYLSTON ST", "COLUMBUS AVE",
                  "MASSACHUSETTS AVE", "NEWBURY ST"),
    Crime_types %in% c("Larceny", "Larceny From Motor Vehicle",
                       "Simple Assault", "Aggravated Assault")
  ) %>%
  group_by(STREET, Crime_types) %>%
  summarize(Occurrences = n()) %>%
  ggplot(aes(STREET, Occurrences, fill = Crime_types)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme(legend.position = "right") +
  # STREET is the x aesthetic (drawn vertically after coord_flip()), so its
  # title names streets; the offence groups are named by the fill legend.
  labs(x = "Street", fill = "Crime types")

Correlation between different types of crimes

# Correlation between the daily counts of each offence group.
tidied_data %>%
  group_by(crime_date, OFFENSE_CODE_GROUP) %>%
  summarise(Counts = n()) %>%
  # Wide layout: one row per date, one column per offence group.
  spread(key = OFFENSE_CODE_GROUP, value = Counts) %>%
  remove_rownames() %>%
  column_to_rownames(var = "crime_date") %>%
  as.matrix() %>%
  # A missing date/offence cell means nothing was recorded, so count it as 0.
  replace(is.na(.), 0) %>%
  cor() %>%
  corrplot(
    type = "upper",
    method = "ellipse",
    tl.col = "black",
    tl.srt = 45,
    number.cex = .75,
    tl.cex = .40,
    outline = FALSE
  )

Crime locations plotted by longitude and latitude.

# Rows with usable coordinates. Only Lat/Long are checked: na.omit() on the
# whole table would also discard rows that are missing a value in any
# unrelated column. -1 is presumably a placeholder for an unknown
# coordinate -- TODO confirm.
temp <- tidied_data %>%
  filter(!is.na(Lat), !is.na(Long), Lat != -1, Long != -1)

# Scatter of crime locations, coloured by district.
temp %>%
  ggplot(aes(x = Long, y = Lat, color = district_name)) +
    geom_point(alpha = .1) +
    # Points are almost transparent; draw the legend keys fully opaque so the
    # colours stay readable.
    guides(colour = guide_legend(override.aes = list(alpha = 1))) +
    theme(panel.background = element_rect(fill = "white"),
          panel.grid = element_line(color = "black"))